# R packages (versions used at the time of analysis noted in comments).
# Load order matters for namespace masking, so do not reorder.
library("corrr") # 0.4.2
library("cowplot") # 1.0.0
library("dplyr") # 1.0.0
library("DT") # 0.14
library("ggplot2") # 3.3.1
library("oem") # 2.0.10
library("gridExtra") # 2.3
library("knitr") # 1.28
library("kableExtra") # 1.1.0
library("magrittr") # 1.5
library("mice") # 3.9.0
library("pROC") # 1.16.2
library("purrr") # 0.3.4
library("rcompanion") # 2.3.25
library("rms") # 6.0.1
library("splines") # base
library("tableone") # 12.0.0
library("tibble") # 3.0.1
library("tidyr") # 1.1.0
# Namespace clash: another attached package also exports select();
# make sure dplyr's version is the one used below
select <- dplyr::select
# Settings
set.seed(123) # Reproducibility of imputation/bootstrap results
theme_set(theme_bw()) # Default ggplot2 theme
n_imputations <- 2 # Number of multiple imputations with MICE
n_boot_val <- 40 # Number of bootstraps with rms validation and calibration
n_boot_probs <- 100 # Number of bootstraps for computing predicted probability CIs
n_rep <- 2 # Number of repeats with grouped lasso
CACHE <- TRUE # Whether to use cached results for long computations
We begin by loading the “training” dataset and restricting it to (i) patients age 18 and older and (ii) patients with an index date more than 2 weeks prior to the data release. Note that, unfortunately, we are not allowed to share this data publicly.
# Load the training data and restrict to adults (age >= 18) whose index date
# is more than 2 weeks before the data release date (2020-06-05)
train_data <- readRDS("train_data.rds") %>%
  filter(age >= 18) %>%
  filter(as.Date("2020-06-05") - index_date > 14)
# Recoding
train_data <- train_data %>%
  mutate(
    ## "died" in add_deaths() should be an integer
    died = as.integer(died),
    ## Convert unknown or other/unknown to missing
    race = ifelse(race == "Other/Unknown", NA, race),
    ethnicity = ifelse(ethnicity == "Unknown", NA, ethnicity),
    sex = ifelse(sex == "Unknown", NA, sex),
    ## Oxygen saturation should have plausible values: 0 is treated as
    ## missing and values above 100% are capped at 100
    spo2 = ifelse(spo2 == 0, NA_real_, spo2),
    spo2 = ifelse(spo2 > 100, 100, spo2)
  ) %>%
  ## Add option for comorbidities in create_comorbidities() to be character
  rename(diabunc = diab) %>%
  ## Recode 0/1 comorbidity flags to "No"/"Yes".
  ## across() supersedes the deprecated mutate_at() used previously.
  mutate(across(
    all_of(c("ami", "chf", "pvd", "cevd", "dementia", "copd", "rheumd", "pud",
             "mld", "diabunc", "diabwc", "hp", "rend", "canc", "msld",
             "metacanc", "aids", "hypc", "hypunc")),
    function(x) ifelse(x == 1, "Yes", "No")
  ))
# Create "derived" variables
## Build the combined race/ethnicity variable. Run again after multiple
## imputation, once missing race + ethnicity values have been filled in.
add_race_ethnicity <- function(data){
  data %>%
    mutate(
      race_ethnicity = case_when(
        # Either component missing -> combined variable is missing
        is.na(race) | is.na(ethnicity) ~ NA_character_,
        # Hispanic ethnicity dominates regardless of race
        ethnicity == "Hispanic" ~ "Hispanic",
        race == "Caucasian" ~ "Non-Hispanic white",
        race == "African American" ~ "Non-Hispanic black",
        race == "Asian" ~ "Asian",
        # Catch-all: any remaining race with non-Hispanic ethnicity
        TRUE ~ "Hispanic"
      ),
      # Non-Hispanic white is the modeling reference category
      race_ethnicity = relevel(factor(race_ethnicity), ref = "Non-Hispanic white")
    )
}
## Derived variables on the training data
train_data <- train_data %>%
  mutate(
    # Days since the earliest index date in the data (continuous time trend)
    calendar_time = as.numeric(index_date - min(index_date)),
    index_month = as.integer(format(index_date, "%m")),
    death_month = as.integer(format(date_of_death,"%m")),
    # Overall survival in days; any negative difference is floored at 0
    os_days = pmax(0, date_of_death - index_date),
    ## Categorize BMI
    bmi_cat = case_when(
      bmi < 18.5 ~ "Underweight",
      bmi >= 18.5 & bmi < 25 ~ "Normal",
      bmi >= 25 & bmi < 30 ~ "Overweight",
      bmi >= 30 ~ "Obese",
      TRUE ~ NA_character_
    ),
    ## Combine comorbidities (with vs. without complications)
    diab = case_when(
      diabunc == "Yes" | diabwc == "Yes" ~ "Yes",
      TRUE ~ "No"
    ),
    hyp = case_when(
      hypc == "Yes" | hypunc == "Yes" ~ "Yes",
      TRUE ~ "No"
    )
  ) %>%
  ## Create race/ethnicity variable
  ## Later to be created after imputation from imputed race + ethnicity
  add_race_ethnicity()
# Labels: metadata tables mapping variable names to display labels and
# variable groups, used for tables and plot axes throughout.
## Categorical variables
demographic_cat_vars <- tribble(
  ~var, ~varlab,
  "sex", "Sex",
  "race", "Race",
  "ethnicity", "Ethnicity",
  "division", "Geographic division",
  "smoke", "Smoking",
  "race_ethnicity", 'Race/Ethnicity'
) %>%
  mutate(group = "Demographics")
comorbidity_vars <- tribble(
  ~var, ~varlab,
  "ami", "Acute myocardial infarction",
  "chf", "Congestive heart failure",
  "pvd", "Peripheral vascular disease",
  "cevd", "Cerebrovascular disease",
  "dementia", "Dementia",
  "copd", "Chronic pulmonary disease",
  "rheumd", "Rheumatoid disease",
  "pud", "Peptic ulcer disease",
  "mld", "Mild liver disease",
  "diabunc", "Diabetes (no complications)",
  "diabwc", "Diabetes (complications)",
  "hp", "Hemiplegia or paraplegia",
  "rend", "Renal disease",
  "canc", "Cancer",
  "msld", "Moderate/severe liver disease",
  "metacanc", "Metastatic cancer",
  "aids", "AIDS/HIV",
  "hypunc", "Hypertension (no complications)",
  "hypc", "Hypertension (complications)",
  "diab", "Diabetes", # Combines diabunc and diabwc
  "hyp", "Hypertension", # Combine hypunc and hypc
) %>%
  mutate(group = "Comorbidities")
vital_cat_vars <- tribble(
  ~var, ~varlab,
  "bmi_cat", "Body Mass Index (BMI)",
) %>%
  mutate(group = "Vitals")
# All categorical variables
cat_vars <- bind_rows(demographic_cat_vars,
                      comorbidity_vars,
                      vital_cat_vars)
## Continuous variables
demographic_continuous_vars <- tribble(
  ~var, ~varlab,
  "age", "Age",
  "calendar_time", "Calendar time"
) %>%
  mutate(group = "Demographics")
vital_continuous_vars <- tribble(
  ~var, ~varlab,
  "bmi", "Body Mass Index (BMI)",
  "dbp", "Diastolic blood pressure",
  "sbp", "Systolic blood pressure",
  "hr", "Heart rate",
  "resp", "Respiration rate",
  "temp", "Temperature",
) %>%
  mutate(group = "Vitals")
lab_vars <- tribble(
  ~var, ~varlab,
  "alt", "Alanine aminotransferase (ALT)",
  "ast", "Aspartate aminotransferase (AST)",
  "crp", "C-reactive protein (CRP)",
  "creatinine", "Creatinine",
  "ferritin", "Ferritin",
  "d_dimer", "Fibrin D-Dimer",
  "ldh", "Lactate dehydrogenase (LDH)",
  "lymphocyte", "Lymphocyte count",
  "neutrophil", "Neutrophil count",
  "spo2", "Oxygen saturation",
  "pct", "Procalcitonin",
  "tni", "Troponin I",
  "plt", "Platelet count (PLT)",
  "wbc", "White blood cell count (WBC)"
) %>%
  mutate(group = "Labs")
# All continuous variables
continuous_vars <- bind_rows(
  demographic_continuous_vars,
  vital_continuous_vars,
  lab_vars
)
# All variables (master metadata table)
vars <- bind_rows(cat_vars,
                  continuous_vars)
# Look up display labels for variable names in the global `vars` metadata
# table; returns NA for names without a label.
get_var_labs <- function(v){
  idx <- match(v, vars$var)
  vars$varlab[idx]
}
Before starting modeling, it’s a good idea to carefully inspect the data.
# Display the training-set sample size as a small kable table
train_data %>%
  count(name = "Sample size") %>%
  mutate(Data = "Optum training set") %>%
  select(Data, `Sample size`) %>%
  kable() %>%
  kable_styling()
| Data | Sample size |
|---|---|
| Optum training set | 13666 |
# Long-format missingness indicators: one row per (patient, variable) with
# missing = 1 if the value is NA
missing_df <- train_data %>%
  select(one_of(vars$var)) %>%
  mutate_all(function (x) ifelse(is.na(x), 1, 0)) %>%
  mutate(id = factor(1:n())) %>%
  pivot_longer(cols = vars$var, names_to = "var", values_to = "missing") %>%
  left_join(vars, by = "var")
# Compute proportion missing
prop_missing <- missing_df %>%
  group_by(varlab) %>%
  summarise(prop = mean(missing))
## `summarise()` ungrouping output (override with `.groups` argument)
# Plot: proportion missing by variable, sorted alphabetically
ggplot(prop_missing, aes(x = varlab, y = prop)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(prop, format = "f", digits = 2)),
            nudge_y = .03, size = 3) +
  ylim(c(0, 1)) +
  xlab("") +
  ylab("Proportion") +
  coord_flip() +
  scale_x_discrete(limits = rev(sort(vars$varlab)))
# Heatmap of missingness by patient (x) and variable (y)
ggplot(missing_df,
       aes(x = id, y = varlab, fill = factor(missing))) +
  geom_raster() +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.y = element_blank(),
        legend.position = "bottom") +
  scale_fill_manual(name = "Missing",
                    values = c("lightgrey", "steelblue"),
                    labels = c("No", "Yes")) +
  scale_y_discrete(limits = rev(sort(vars$varlab)))
## Warning: Raster pixels are placed at uneven vertical intervals and will be shifted. Consider using geom_tile() instead.
# Histogram of per-patient proportion of missing covariates
missing_df %>%
  # Count of missing by patient
  group_by(id) %>%
  summarise(n_missing = sum(missing),
            prop_missing = n_missing/n()) %>%
  # Plot
  ggplot(aes(x = prop_missing)) +
  geom_histogram(binwidth = .03, color = "white") +
  scale_x_continuous(breaks = seq(0, 1, .05)) +
  scale_y_continuous(n.breaks = 20) +
  xlab("Proportion of covariates that are missing") +
  ylab("Count")
## `summarise()` ungrouping output (override with `.groups` argument)
# Frequencies of each level of every categorical variable (non-missing only)
cat_var_df <- train_data %>%
  select(one_of("ptid", cat_vars$var)) %>%
  pivot_longer(cols = cat_vars$var, names_to = "var", values_to = "value") %>%
  left_join(cat_vars, by = "var") %>%
  filter(!is.na(value)) %>%
  group_by(var, varlab, value) %>%
  summarise(n = n()) %>%
  group_by(varlab) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  mutate(
    # Offset for text labels so they don't overlap the points: push right
    # for small frequencies, left for large ones
    nudge_x = case_when(
      freq < 0.5 ~ 0.15,
      TRUE ~ -0.15
    )
  )
## `summarise()` regrouping output by 'var', 'varlab' (override with `.groups` argument)
# Dot plot of level frequencies, one facet per variable.
# NOTE(review): passing cat_var_df$nudge_x as a vector to nudge_x relies on
# geom_text rows staying in the same order as cat_var_df -- fragile but
# works here since the same data frame is plotted unmodified.
ggplot(cat_var_df,
       aes(x = freq, y = value)) +
  geom_point() +
  geom_text(aes(label = formatC(freq, format = "f", digits = 2)),
            nudge_x = cat_var_df$nudge_x, size = 3.5) +
  facet_wrap(~varlab, scales = "free_y", ncol = 4) +
  xlim(0, 1) +
  xlab("Proportion") +
  ylab("") +
  theme(axis.text.x = element_text(size = 10),
        axis.text.y = element_text(size = 10),
        strip.text.x = element_text(size = 7))
# Pivot continuous variables into long format (one row per patient and
# variable), attach variable labels, and drop missing values.
#
# Bug fix: the original body piped the global `train_data` instead of the
# `data` argument, silently ignoring whatever the caller passed. All existing
# calls pass train_data, so behavior is unchanged for them.
pivot_continuous_longer <- function(data, vars){
  col_names <- vars$var
  data %>%
    select(one_of("ptid", col_names)) %>%
    pivot_longer(cols = col_names,
                 names_to = "var",
                 values_to = "value") %>%
    left_join(vars, by = "var") %>%
    filter(!is.na(value))
}
# Long-format data for all continuous variables (observed values only)
continuous_var_df <- pivot_continuous_longer(train_data,
                                             vars = continuous_vars)
# Boxplots of each continuous variable, one facet per variable.
# The facet strip already labels each panel, so the x axis is hidden.
plot_box <- function(data){
  ggplot(data, aes(x = varlab, y = value)) +
    geom_boxplot(outlier.size = 1) +
    facet_wrap(~varlab, scales = "free") +
    xlab("") +
    ylab("Value") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank(),
      strip.text = element_text(size = 7)
    )
}
plot_box(continuous_var_df)
# Histograms of each continuous variable, one facet per variable
plot_hist <- function(data){
  ggplot(data, aes(x = value)) +
    geom_histogram(bins = 40, color = "white") +
    facet_wrap(~varlab, scales = "free", ncol = 4) +
    xlab("") +
    ylab("Frequency") +
    theme(strip.text = element_text(size = 7))
}
plot_hist(continuous_var_df)
Visual inspection of the box plots and histograms suggests that there are significant outliers in the labs. Let’s look at how many observations lie above the 99th percentile of the data and the “outer fence” (defined as the 3rd quartile plus 3 times the interquartile range). We will then create new lab variables truncated from above at the outer fence and replot the histograms.
# Upper "outer fence" of a numeric vector: Q3 + 3 * IQR.
# Values above this threshold are treated as extreme outliers.
outer_fence <- function(v){
  quartiles <- quantile(v, probs = c(.25, .75), na.rm = TRUE)
  as.numeric(quartiles[2] + 3 * (quartiles[2] - quartiles[1]))
}
# Format a proportion as a percent string with one decimal,
# e.g. 0.123 -> "12.3%". Vectorized.
format_percent <- function(x){
  pct <- formatC(100 * x, format = "f", digits = 1)
  paste0(pct, "%")
}
# Summarize lab outliers: maximum, 99th percentile, outer fence, and the
# share of observations above the outer fence for each lab
train_data %>%
  select(one_of(lab_vars$var)) %>%
  pivot_longer(cols = lab_vars$var, names_to = "Lab") %>%
  filter(!is.na(value)) %>%
  group_by(Lab) %>%
  summarise(Maximum = max(value),
            `99%` = quantile(value, .99),
            `Outer fence` = outer_fence(value),
            `% above outer fence` = format_percent(mean(value > outer_fence(value)))) %>%
  mutate(Lab = get_var_labs(Lab)) %>%
  kable() %>%
  kable_styling()
## `summarise()` ungrouping output (override with `.groups` argument)
| Lab | Maximum | 99% | Outer fence | % above outer fence |
|---|---|---|---|---|
| Alanine aminotransferase (ALT) | 3017.00 | 224.640000 | 130.000 | 3.4% |
| Aspartate aminotransferase (AST) | 7000.01 | 336.000000 | 157.000 | 3.6% |
| Creatinine | 27.35 | 10.878500 | 3.240 | 6.4% |
| C-reactive protein (CRP) | 638.00 | 359.030000 | 458.000 | 0.2% |
| Fibrin D-Dimer | 114475.00 | 19499.300000 | 4993.000 | 7.0% |
| Ferritin | 100000.01 | 7747.270000 | 3646.875 | 3.4% |
| Lactate dehydrogenase (LDH) | 14007.00 | 1262.520000 | 1049.500 | 1.9% |
| Lymphocyte count | 120.35 | 3.736700 | 3.500 | 1.2% |
| Neutrophil count | 82.40 | 18.900000 | 18.320 | 1.2% |
| Procalcitonin | 753.66 | 30.122600 | 1.270 | 10.2% |
| Platelet count (PLT) | 1213.00 | 527.830000 | 569.000 | 0.7% |
| Oxygen saturation | 100.00 | 100.000000 | 110.000 | 0.0% |
| Troponin I | 95.40 | 2.189095 | 0.170 | 8.1% |
| White blood cell count (WBC) | 127.50 | 22.600000 | 21.700 | 1.2% |
# Truncate labs from above at the outer fence, storing results in new
# "_t"-suffixed columns. NAs are preserved.
# Fixes: seq_along() instead of 1:length() (safe for empty input) and
# pmin() instead of ifelse() (same result: caps values above the fence,
# propagates NA).
truncate_max <- function(v) outer_fence(v)
for (i in seq_along(lab_vars$var)){
  original_var <- lab_vars$var[i]
  truncated_var <- paste0(original_var, "_t")
  truncated_max_i <- truncate_max(train_data[[original_var]])
  train_data[[truncated_var]] <- pmin(train_data[[original_var]],
                                      truncated_max_i)
}
After truncating the labs, the probability distributions appear more reasonable.
# Register the truncated ("_t") lab variables in the metadata tables and
# replot the histograms using the truncated values
lab_vars_t <- lab_vars %>%
  mutate(var = paste0(var, "_t"))
vars <- bind_rows(vars, lab_vars_t)
continuous_vars_t <- bind_rows(
  demographic_continuous_vars,
  vital_continuous_vars,
  lab_vars_t
)
continuous_var_t_df <- pivot_continuous_longer(train_data,
                                               vars = continuous_vars_t)
plot_hist(continuous_var_t_df)
# Scatter plots of selected continuous variable pairs (currently age vs BMI).
# Fix: seq_along() instead of 1:length() (safe if the vectors are empty).
x_vars <- c("age")
y_vars <- c("bmi")
p <- vector(mode = "list", length = length(x_vars))
for (i in seq_along(x_vars)){
  # NOTE(review): aes_string() is deprecated in newer ggplot2 releases;
  # fine for the 3.3.1 pinned here
  p[[i]] <- ggplot(train_data, aes_string(x = x_vars[i], y = y_vars[i])) +
    geom_point() +
    #geom_smooth(method = "loess", formula = y ~ x) +
    xlab(get_var_labs(x_vars[i])) +
    ylab(get_var_labs(y_vars[i]))
}
do.call("grid.arrange", c(p, nrow = 1))
## Warning: Removed 1627 rows containing missing values (geom_point).
# Overall proportion of patients who died
train_data %>%
  count(died) %>%
  mutate(died = ifelse(died == 1, "Yes", "No"),
         prop = n/sum(n)) %>%
  ggplot(aes(x = died, y = prop)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = formatC(prop, format = "f", digits = 2)),
            nudge_y = .03, size = 3) +
  xlab("Died") +
  ylab("Proportion")
# Distribution of (whole) months between index date and death, among deaths
train_data %>%
  mutate(months_to_death = death_month - index_month) %>%
  filter(!is.na(months_to_death)) %>%
  group_by(months_to_death) %>%
  tally() %>%
  # Plot
  ggplot(aes(x = factor(months_to_death), y = n)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Months from index date to death") +
  ylab("Count")
# Select variables for table 1.
# Excluded: the components of combined variables (race/ethnicity,
# diab/hyp with vs. without complications) and the categorized BMI.
remove_varnames <- c("race", "ethnicity", "bmi_cat", "hypunc", "hypc",
                     "diabunc", "diabwc")
all_varnames <- bind_rows(
  demographic_cat_vars,
  demographic_continuous_vars,
  comorbidity_vars,
  vital_cat_vars,
  vital_continuous_vars,
  lab_vars
) %>%
  filter(!var %in% remove_varnames) %>%
  pull(var)
# Variables treated as categorical by CreateTableOne
categorical_varnames <- bind_rows(
  demographic_cat_vars,
  comorbidity_vars,
  vital_cat_vars
) %>%
  filter(!var %in% remove_varnames) %>%
  pull(var)
# Continuous variables summarized with median [IQR] rather than mean (SD)
non_normal_varnames <- c(
  demographic_continuous_vars$var,
  vital_continuous_vars$var,
  lab_vars$var
)
## Create a TableOne object, stratified by survival, with an overall column.
## Columns are renamed to their display labels before tabulation.
table1 <- train_data %>%
  select(one_of(all_varnames, "died")) %>%
  rename_with(get_var_labs, .cols = all_of(all_varnames)) %>%
  rename(Survivor = died) %>%
  CreateTableOne(vars = get_var_labs(all_varnames),
                 factorVars = get_var_labs(categorical_varnames),
                 strata = "Survivor", addOverall = TRUE,
                 data = .)
## Print table 1 (medians/IQRs for non-normal variables; both levels of Sex)
print(table1, nonnormal = get_var_labs(non_normal_varnames),
      cramVars = c("Sex"),
      contDigits = 1, missing = TRUE, printToggle = FALSE) %>%
  set_colnames(c("Overall", "Survivor", "Non-survivor",
                 "p", "Test", "Missing")) %>%
  kable() %>%
  kable_styling()
| Overall | Survivor | Non-survivor | p | Test | Missing | |
|---|---|---|---|---|---|---|
| n | 13666 | 11504 | 2162 | |||
| Sex = Female/Male (%) | 6565/7097 (48.1/51.9) | 5638/5862 (49.0/51.0) | 927/1235 (42.9/57.1) | <0.001 | 0.0 | |
| Geographic division (%) | <0.001 | 2.6 | ||||
| East North Central | 4638 (34.9) | 3965 (35.4) | 673 (32.0) | |||
| East South Central | 45 ( 0.3) | 43 ( 0.4) | 2 ( 0.1) | |||
| Middle Atlantic | 4634 (34.8) | 3842 (34.3) | 792 (37.6) | |||
| Mountain | 147 ( 1.1) | 133 ( 1.2) | 14 ( 0.7) | |||
| New England | 1583 (11.9) | 1272 (11.4) | 311 (14.8) | |||
| Other | 2 ( 0.0) | 2 ( 0.0) | 0 ( 0.0) | |||
| Pacific | 511 ( 3.8) | 438 ( 3.9) | 73 ( 3.5) | |||
| South Atl/West South Crl | 363 ( 2.7) | 316 ( 2.8) | 47 ( 2.2) | |||
| West North Central | 1383 (10.4) | 1189 (10.6) | 194 ( 9.2) | |||
| Smoking (%) | <0.001 | 25.6 | ||||
| Current | 865 ( 8.5) | 786 ( 9.0) | 79 ( 5.3) | |||
| Never | 6220 (61.1) | 5459 (62.8) | 761 (51.3) | |||
| Previous | 3089 (30.4) | 2447 (28.2) | 642 (43.3) | |||
| Race/Ethnicity (%) | <0.001 | 26.2 | ||||
| Non-Hispanic white | 5646 (56.0) | 4455 (53.1) | 1191 (70.3) | |||
| Asian | 362 ( 3.6) | 307 ( 3.7) | 55 ( 3.2) | |||
| Hispanic | 536 ( 5.3) | 481 ( 5.7) | 55 ( 3.2) | |||
| Non-Hispanic black | 3547 (35.2) | 3153 (37.6) | 394 (23.2) | |||
| Age (median [IQR]) | 62.0 [49.0, 75.0] | 59.0 [46.0, 71.0] | 77.0 [67.0, 85.0] | <0.001 | nonnorm | 0.0 |
| Calendar time (median [IQR]) | 46.0 [37.0, 63.0] | 46.0 [37.0, 63.0] | 45.0 [36.0, 59.0] | <0.001 | nonnorm | 0.0 |
| Acute myocardial infarction = Yes (%) | 1534 (11.2) | 1027 ( 8.9) | 507 (23.5) | <0.001 | 0.0 | |
| Congestive heart failure = Yes (%) | 2323 (17.0) | 1603 (13.9) | 720 (33.3) | <0.001 | 0.0 | |
| Peripheral vascular disease = Yes (%) | 1671 (12.2) | 1176 (10.2) | 495 (22.9) | <0.001 | 0.0 | |
| Cerebrovascular disease = Yes (%) | 1439 (10.5) | 1023 ( 8.9) | 416 (19.2) | <0.001 | 0.0 | |
| Dementia = Yes (%) | 1394 (10.2) | 854 ( 7.4) | 540 (25.0) | <0.001 | 0.0 | |
| Chronic pulmonary disease = Yes (%) | 3622 (26.5) | 2928 (25.5) | 694 (32.1) | <0.001 | 0.0 | |
| Rheumatoid disease = Yes (%) | 398 ( 2.9) | 315 ( 2.7) | 83 ( 3.8) | 0.006 | 0.0 | |
| Peptic ulcer disease = Yes (%) | 206 ( 1.5) | 160 ( 1.4) | 46 ( 2.1) | 0.013 | 0.0 | |
| Mild liver disease = Yes (%) | 879 ( 6.4) | 711 ( 6.2) | 168 ( 7.8) | 0.007 | 0.0 | |
| Hemiplegia or paraplegia = Yes (%) | 330 ( 2.4) | 228 ( 2.0) | 102 ( 4.7) | <0.001 | 0.0 | |
| Renal disease = Yes (%) | 2833 (20.7) | 1984 (17.2) | 849 (39.3) | <0.001 | 0.0 | |
| Cancer = Yes (%) | 1677 (12.3) | 1282 (11.1) | 395 (18.3) | <0.001 | 0.0 | |
| Moderate/severe liver disease = Yes (%) | 128 ( 0.9) | 88 ( 0.8) | 40 ( 1.9) | <0.001 | 0.0 | |
| Metastatic cancer = Yes (%) | 277 ( 2.0) | 188 ( 1.6) | 89 ( 4.1) | <0.001 | 0.0 | |
| AIDS/HIV = Yes (%) | 100 ( 0.7) | 88 ( 0.8) | 12 ( 0.6) | 0.361 | 0.0 | |
| Diabetes = Yes (%) | 4612 (33.7) | 3669 (31.9) | 943 (43.6) | <0.001 | 0.0 | |
| Hypertension = Yes (%) | 8000 (58.5) | 6331 (55.0) | 1669 (77.2) | <0.001 | 0.0 | |
| Body Mass Index (BMI) (median [IQR]) | 29.7 [25.5, 35.1] | 30.0 [25.8, 35.4] | 28.1 [24.0, 33.5] | <0.001 | nonnorm | 11.9 |
| Diastolic blood pressure (median [IQR]) | 73.0 [65.5, 80.5] | 74.0 [66.6, 81.0] | 68.0 [60.0, 75.5] | <0.001 | nonnorm | 3.1 |
| Systolic blood pressure (median [IQR]) | 126.0 [115.0, 139.0] | 127.0 [116.0, 139.0] | 122.0 [109.0, 136.5] | <0.001 | nonnorm | 3.2 |
| Heart rate (median [IQR]) | 87.5 [77.5, 98.0] | 87.0 [77.5, 98.0] | 89.0 [77.5, 102.0] | <0.001 | nonnorm | 3.1 |
| Respiration rate (median [IQR]) | 20.0 [18.0, 22.0] | 19.5 [18.0, 21.0] | 22.0 [19.0, 26.0] | <0.001 | nonnorm | 3.9 |
| Temperature (median [IQR]) | 37.0 [36.7, 37.4] | 37.0 [36.7, 37.4] | 37.1 [36.7, 37.6] | <0.001 | nonnorm | 3.1 |
| Alanine aminotransferase (ALT) (median [IQR]) | 28.0 [18.0, 46.0] | 28.0 [18.0, 46.0] | 27.0 [18.0, 44.0] | 0.099 | nonnorm | 20.1 |
| Aspartate aminotransferase (AST) (median [IQR]) | 37.0 [25.0, 58.0] | 35.0 [25.0, 54.5] | 46.0 [30.0, 73.0] | <0.001 | nonnorm | 21.1 |
| C-reactive protein (CRP) (median [IQR]) | 79.1 [34.0, 140.0] | 72.2 [30.0, 130.0] | 116.0 [63.0, 184.0] | <0.001 | nonnorm | 38.7 |
| Creatinine (median [IQR]) | 1.0 [0.8, 1.4] | 1.0 [0.8, 1.3] | 1.3 [1.0, 2.1] | <0.001 | nonnorm | 10.4 |
| Ferritin (median [IQR]) | 510.0 [224.4, 1080.0] | 470.0 [207.1, 992.0] | 747.0 [320.0, 1500.0] | <0.001 | nonnorm | 43.7 |
| Fibrin D-Dimer (median [IQR]) | 750.0 [390.0, 1540.8] | 695.0 [370.0, 1346.5] | 1345.0 [668.2, 3315.0] | <0.001 | nonnorm | 90.4 |
| Lactate dehydrogenase (LDH) (median [IQR]) | 321.0 [238.0, 440.9] | 308.0 [232.0, 415.0] | 404.0 [284.0, 557.0] | <0.001 | nonnorm | 45.3 |
| Lymphocyte count (median [IQR]) | 1.0 [0.7, 1.4] | 1.0 [0.7, 1.4] | 0.8 [0.5, 1.1] | <0.001 | nonnorm | 11.2 |
| Neutrophil count (median [IQR]) | 4.9 [3.4, 7.1] | 4.7 [3.2, 6.7] | 6.1 [4.1, 9.2] | <0.001 | nonnorm | 11.3 |
| Oxygen saturation (median [IQR]) | 96.0 [94.0, 98.0] | 96.0 [94.5, 98.0] | 95.0 [93.0, 97.0] | <0.001 | nonnorm | 3.9 |
| Procalcitonin (median [IQR]) | 0.1 [0.1, 0.4] | 0.1 [0.1, 0.3] | 0.3 [0.1, 1.0] | <0.001 | nonnorm | 49.3 |
| Troponin I (median [IQR]) | 0.0 [0.0, 0.0] | 0.0 [0.0, 0.0] | 0.0 [0.0, 0.1] | <0.001 | nonnorm | 41.2 |
| Platelet count (PLT) (median [IQR]) | 202.0 [157.0, 260.0] | 205.0 [160.0, 262.0] | 187.8 [143.9, 245.0] | <0.001 | nonnorm | 9.9 |
| White blood cell count (WBC) (median [IQR]) | 6.7 [4.9, 9.1] | 6.5 [4.8, 8.7] | 7.7 [5.6, 11.1] | <0.001 | nonnorm | 9.7 |
#summary(table1)
The functional form of the relationship between mortality and the continuous variables is assessed using a series of univariate fits. We mainly rely on visual inspection of the graphs but also report the Bayesian information criterion (BIC).
# Fit univariate logistic regressions of death on a single variable:
# a linear fit plus natural cubic spline fits of increasing flexibility.
# Returns a named list of glm fits; the names flow into downstream plots.
fit_univariate_logit <- function(var, data){
  # Build the model formula "died ~ <rhs>"
  make_f <- function(rhs){
    as.formula(paste("died", rhs, sep =" ~ "))
  }
  fit_logit <- function(f, data){
    glm(f, family = "binomial", data = data)
  }
  # NOTE(review): ns(x, df) is called with df = 5/6/7; splines::ns()
  # interprets the second argument as the basis dimension (df), which with
  # intercept = FALSE gives df - 1 interior knots. Confirm that the
  # "3/4/5 knots" labels use the intended knot-counting convention.
  list(
    `Linear` = fit_logit(make_f(var), data),
    `Spline 3 knots` = fit_logit(make_f(sprintf("ns(%s, 5)", var)), data),
    `Spline 4 knots` = fit_logit(make_f(sprintf("ns(%s, 6)", var)), data),
    `Spline 5 knots` = fit_logit(make_f(sprintf("ns(%s, 7)", var)), data)
  )
}
# Predictions from each univariate model over a grid of variable values,
# returned long: one row per (value, model) with prediction column `y`.
# `type` is passed through to predict.glm ("response" or "link").
predict_univariate_logit <- function(models, var, var_values, type = "response"){
  grid_df <- data.frame(var_values)
  names(grid_df) <- var
  preds <- map_dfc(models,
                   function(m) predict(m, newdata = grid_df, type = type))
  preds %>%
    mutate(var = var_values) %>%
    pivot_longer(cols = names(models),
                 names_to = "Model",
                 values_to = "y")
}
# Midpoint of interval labels like "(1,3]" as produced by cut().
# Strips the brackets, splits on the comma, and averages the endpoints.
midpoint <- function(x, digits = 2){
  stripped <- gsub("\\(|\\[|\\)|\\]", "", x)
  lower <- as.numeric(gsub(",.*", "", stripped))
  upper <- as.numeric(gsub(".*,", "", stripped))
  round(lower + (upper - lower) / 2, digits)
}
# Bin a continuous variable from the global train_data into 20 intervals and
# compute the observed death rate (y) and sample size (n) in each bin, keyed
# by the bin midpoint. Used to overlay observed rates on fitted curves.
# Note: `var_values` is accepted for call-site symmetry but is not used.
bin_y <- function(var, var_values){
  binned <- train_data[, c(var, "died")] %>%
    filter(!is.na(get(var))) %>%
    mutate(x_cat = cut(get(var), breaks = 20),
           x_midpoint = midpoint(x_cat)) %>%
    group_by(x_midpoint) %>%
    summarise(y = mean(died),
              n = n())
  # Rename the midpoint column to the variable name for plotting
  colnames(binned)[1] <- var
  binned
}
# Plot fitted univariate curves (one facet per model) with binned observed
# death rates overlaid as points sized by bin sample size.
# NOTE(review): the `ylab` argument is immediately overwritten by the
# switch() below, so any caller-supplied value is ignored.
plot_univariate_logit <- function(models, var, var_values, var_lab = "Variable",
                                  type = "response", ylab = "Probability of death"){
  # Plotting data
  predicted_probs <- predict_univariate_logit(models, var, var_values, type = type)
  ylab <- switch(type,
                 "link" = "Log odds",
                 "response" = "Probability of death",
                 stop("Type must be 'link' or 'response'")
  )
  binned_y <- bin_y(var, var_values)
  if (type == "link"){
    # Clamp empirical rates of exactly 0 or 1 before the logit transform,
    # since qlogis(0)/qlogis(1) are infinite
    binned_y$y <- ifelse(binned_y$y == 0, .001, binned_y$y)
    binned_y$y <- ifelse(binned_y$y == 1, .99, binned_y$y)
    binned_y$y <- qlogis(binned_y$y)
  }
  # Plotting scales: common y limits spanning both fitted and observed values
  y_min <- min(c(binned_y$y, predicted_probs$y))
  y_max <- max(c(binned_y$y, predicted_probs$y))
  size_breaks <- seq(min(binned_y$n), max(binned_y$n),
                     length.out = 6)
  # Plot
  ggplot(predicted_probs,
         aes(x = var, y = y)) +
    geom_line() +
    geom_point(data = binned_y, aes_string(x = var, y = "y", size = "n")) +
    facet_wrap(~Model, ncol = 2) +
    xlab(var_lab) +
    ylab(ylab) +
    ylim(floor(y_min), ceiling(y_max)) +
    scale_size(name = "Sample size", range = c(0.3, 3),
               breaks = round(size_breaks, 0))
}
# Evenly spaced grid of 100 values spanning the observed range of `var`
# in the global train_data (NAs ignored).
make_seq <- function(var){
  rng <- range(train_data[[var]], na.rm = TRUE)
  seq(rng[1], rng[2], length.out = 100)
}
# Fit, plot, and score (BIC) the univariate logistic fits for one variable.
# Fixes: `<-` instead of `=` for local assignment; vapply() instead of
# sapply() so the BIC result is guaranteed to be a named numeric vector.
evaluate_univariate_logit <- function(var, print = TRUE){
  var_values <- make_seq(var)
  var_lab <- get_var_labs(var)
  # Do evaluations
  fits <- fit_univariate_logit(var, data = train_data)
  p_link <- plot_univariate_logit(fits, var, var_values, var_lab, type = "link")
  p_probs <- plot_univariate_logit(fits, var, var_values, var_lab, type = "response")
  bic <- vapply(fits, BIC, numeric(1))
  # Print plots and BICs (lower BIC is better)
  if (print){
    print(p_link)
    print(p_probs)
    print(bic)
  }
  return(list(fits = fits, p_link = p_link, p_probs = p_probs,
              bic = bic))
}
# Univariate evaluations for every continuous candidate, including both raw
# and truncated ("_t") versions of the labs. The `##` lines record the BIC
# by functional form (lower is better).
uv_age <- evaluate_univariate_logit("age")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10152.05 10178.63 10188.22 10197.59
uv_calendar_time <- evaluate_univariate_logit("calendar_time")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11926.53 11947.74 11956.52 11965.13
uv_bmi <- evaluate_univariate_logit("bmi")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10631.88 10625.62 10634.47 10640.01
uv_dbp <- evaluate_univariate_logit("dbp")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11075.50 11067.63 11077.10 11086.28
uv_sbp <- evaluate_univariate_logit("sbp")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11464.70 11247.39 11256.25 11265.91
uv_hr <- evaluate_univariate_logit("hr")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11501.36 11405.73 11413.55 11421.89
uv_resp <- evaluate_univariate_logit("resp")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10738.21 10504.93 10513.53 10521.69
uv_temp <- evaluate_univariate_logit("temp")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11595.41 11353.97 11361.13 11368.32
uv_alt <- evaluate_univariate_logit("alt")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10127.24 10157.07 10164.25 10170.19
uv_alt_t <- evaluate_univariate_logit("alt_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10125.03 10158.67 10168.22 10176.69
uv_ast <- evaluate_univariate_logit("ast")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 9996.867 9838.208 9845.513 9853.245
uv_ast_t <- evaluate_univariate_logit("ast_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 9849.657 9841.263 9850.436 9859.691
uv_crp <- evaluate_univariate_logit("crp")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7610.895 7598.575 7606.895 7613.547
uv_crp_t <- evaluate_univariate_logit("crp_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7609.220 7598.523 7606.862 7613.446
uv_creatinine <- evaluate_univariate_logit("creatinine")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10991.89 10511.32 10499.98 10500.91
uv_creatinine_t <- evaluate_univariate_logit("creatinine_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10599.56 10477.76 10485.67 10492.08
uv_ferritin <- evaluate_univariate_logit("ferritin")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7311.431 7200.894 7211.518 7217.965
uv_ferritin_t <- evaluate_univariate_logit("ferritin_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7216.741 7203.104 7214.089 7219.304
uv_d_dimer <- evaluate_univariate_logit("d_dimer")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 1201.619 1189.305 1193.129 1197.296
uv_d_dimer_t <- evaluate_univariate_logit("d_dimer_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 1154.320 1177.934 1185.052 1192.091
uv_ldh <- evaluate_univariate_logit("ldh")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 6918.661 6729.046 6731.922 6737.658
uv_ldh_t <- evaluate_univariate_logit("ldh_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 6724.449 6713.154 6722.186 6730.649
uv_lymphocyte <- evaluate_univariate_logit("lymphocyte")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11015.74 10671.85 10679.94 10688.67
uv_lymphocyte_t <- evaluate_univariate_logit("lymphocyte_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10834.08 10664.96 10672.87 10681.37
uv_neutrophil <- evaluate_univariate_logit("neutrophil")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10736.10 10717.76 10718.11 10725.33
uv_neutrophil_t <- evaluate_univariate_logit("neutrophil_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10704.88 10705.77 10711.41 10719.06
uv_spo2 <- evaluate_univariate_logit("spo2")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11140.41 11107.89 11117.45 11127.13
uv_pct <- evaluate_univariate_logit("pct")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 6956.536 6499.081 6498.854 6504.554
uv_pct_t <- evaluate_univariate_logit("pct_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 6566.560 6495.053 6497.433 6506.346
uv_tni <- evaluate_univariate_logit("tni")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7754.330 7045.431 7050.912 7059.858
uv_tni_t <- evaluate_univariate_logit("tni_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 7077.283 6954.025 6962.474 6970.854
uv_plt <- evaluate_univariate_logit("plt")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11101.79 11079.13 11086.73 11095.29
uv_plt_t <- evaluate_univariate_logit("plt_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 11098.10 11078.63 11086.54 11095.09
uv_wbc <- evaluate_univariate_logit("wbc")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10985.17 10939.94 10944.21 10950.75
uv_wbc_t <- evaluate_univariate_logit("wbc_t")
## Linear Spline 3 knots Spline 4 knots Spline 5 knots
## 10932.45 10925.13 10932.84 10939.63
Let’s specify variables that will be candidates for our model and used during variable selection. We will combine race and ethnicity into a single variable, but will wait to do this until after multiple imputation (see next section).
# Candidate predictors for variable selection
demographics_to_include <- c("age", "sex", "race", "ethnicity", "division",
                             "smoke", "calendar_time")
## Use the combined diabetes/hypertension variables rather than their
## with/without-complication components.
## NOTE(review): comorbidity_vars already contains "diab" and "hyp" rows, so
## the c("diab", "hyp") below duplicates them; harmless because the result
## is only used in a %in% membership test.
comorbidities_to_include <- comorbidity_vars %>%
  filter(!var %in% c("diabunc", "diabwc", "hypunc", "hypc")) %>%
  pull(var) %>%
  c("diab", "hyp")
vitals_to_include <- c("bmi", "temp", "hr", "resp", "sbp")
## Truncated ("_t") versions of the labs are used as candidates
labs_to_include <- c("spo2", "crp_t", "tni_t", "ast_t", "ferritin_t",
                     "creatinine_t", "ldh_t", "lymphocyte_t", "neutrophil_t",
                     "plt_t", "wbc_t")
# Flag candidate variables in the metadata table (include = 1)
vars <- vars %>%
  mutate(include = ifelse(var %in% c(demographics_to_include,
                                     comorbidities_to_include,
                                     vitals_to_include,
                                     labs_to_include),
                          1, 0))
# Names of the variables currently flagged for model inclusion
# in the global `vars` metadata table.
get_included_vars <- function(){
  vars$var[vars$include == 1]
}
# Build a one-sided model formula (~ v1 + v2 + ...) from variable names
make_rhs <- function(vars){
  reformulate(termlabels = vars)
}
# Right-hand side of the full candidate model formula
candidate_model_rhs <- make_rhs(get_included_vars())
Note that there is no “Other” geographic region as all 9 of the US geographic divisions are included in the data. We will consequently recode this category to missing, and pool the two sparsest divisions (“East South Central” and “Mountain”) into a new “Other” category.
# Recode the raw "Other" division to missing, then pool the two sparsest
# divisions into a new "Other" category
train_data <- train_data %>%
  mutate(division = ifelse(division == "Other", NA, division),
         division = ifelse(division %in% c("East South Central", "Mountain"),
                           "Other", division))
Let’s re-level them so that we have preferred reference categories.
# Set preferred reference categories for modeling
train_data <- train_data %>% mutate(
  sex = relevel(factor(sex), ref = "Male"),
  division = relevel(factor(division), ref = "Pacific"),
  smoke = relevel(factor(smoke), ref = "Never"),
  bmi_cat = relevel(factor(bmi_cat), ref = "Normal")
)
We can now multiply impute the data using MICE.
# Run MICE algorithm on the candidate predictors. The outcome ("died") is
# included so the imputation models can condition on it; character columns
# are converted to factors as mice expects.
mice_out <- train_data %>%
  select(c(one_of(get_included_vars()), "died")) %>%
  mutate_if(is.character, as.factor) %>%
  mice(m = n_imputations, maxit = 5)
##
## iter imp variable
## 1 1 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 1 2 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 2 1 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 2 2 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 3 1 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 3 2 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 4 1 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 4 2 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 5 1 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
## 5 2 sex race ethnicity division smoke bmi sbp hr resp temp spo2 ast_t crp_t creatinine_t ferritin_t ldh_t lymphocyte_t neutrophil_t tni_t plt_t wbc_t
# Append datasets and add outcome: stack the original data (.imp == 0) with
# all imputed datasets in long format; `died` has no missing values so it is
# simply recycled across the m + 1 stacked copies.
mi_df <- complete(mice_out, action = "long", include = TRUE) %>%
  as_tibble() %>%
  mutate(died = rep(train_data$died, mice_out$m + 1))
# To compare MICE to aregImpute
# areg_out <- aregImpute(update.formula(candidate_model_rhs, ~.),
# n.impute = 2, data = train_data)
The distributions of the imputed and observed data are compared as a diagnostic for the imputation. They look pretty similar suggesting that there is nothing terribly wrong with the imputation.
# Build long-format data frames of imputed values for diagnostics, from
# either a mice "mids" object or an Hmisc aregImpute fit.
#
# Returns a list with elements `continuous` and `cat`, each a tibble with
# columns: var, imp (imputation number), value, obs ("Imputed"), varlab.
#
# Fixes: seq_along() instead of 1:length() (safe for empty lists) and
# vapply() instead of sapply() (guaranteed logical vector).
make_imp_df <- function(object){
  # Get imputations: a named list with one element per variable, each a
  # matrix/data frame of (n missing) x (n imputations) imputed values
  if (inherits(object, "mids")){
    imp <- object$imp
  } else{ # aregImpute: map numeric category codes back to level labels
    imp <- object$imputed
    for (i in seq_along(imp)){
      cat_levels_i <- object$cat.levels[[i]]
      if (!is.null(cat_levels_i) && !is.null(imp[[i]])){
        levels <- sort(unique(c(imp[[i]])))
        imp[[i]] <- apply(imp[[i]],
                          2,
                          function(x) factor(x, levels = levels,
                                             labels = cat_levels_i))
      }
    }
  }
  # Split variables into continuous vs categorical based on the first
  # imputed column's type
  is_numeric <- vapply(imp, function (x) is.numeric(x[, 1]), logical(1))
  continuous_df <- vector(mode = "list", length = sum(is_numeric))
  cat_df <- vector(mode = "list", length = sum(!is_numeric))
  continuous_cntr <- 1
  cat_cntr <- 1
  for (i in seq_along(imp)){
    # Variables with no missing values have zero imputation rows: skip them
    if(!is.null(nrow(imp[[i]])) && nrow(imp[[i]]) > 0 ){
      imp_i_df <- data.frame(var = names(imp)[i],
                             imp = rep(1:ncol(imp[[i]]), each = nrow(imp[[i]])),
                             value = c(as.matrix(imp[[i]]))) %>%
        as_tibble()
    } else{
      imp_i_df <- NULL
    }
    if (is_numeric[i]){
      continuous_df[[continuous_cntr]] <- imp_i_df
      continuous_cntr <- continuous_cntr + 1
    } else{
      cat_df[[cat_cntr]] <- imp_i_df
      cat_cntr <- cat_cntr + 1
    }
  }
  # Row bind data frames and attach display labels
  continuous_df <- bind_rows(continuous_df) %>%
    mutate(obs = "Imputed",
           varlab = get_var_labs(var))
  cat_df <- bind_rows(cat_df) %>%
    mutate(obs = "Imputed",
           varlab = get_var_labs(var))
  # Return
  return(list(continuous = continuous_df,
              cat = cat_df))
}
# Extract imputed values for diagnostics
imp_df <- make_imp_df(mice_out)
#imp_df <- make_imp_df(areg_out)
# Plot continuous variables
## Data for plotting: stack imputed values with observed values, keeping
## only variables that actually had values imputed
obsimp_df_continuous <- bind_rows(
  imp_df$continuous,
  continuous_var_df %>%
    select(var, value, varlab) %>%
    mutate(imp = 0, obs = "Observed")
) %>%
  mutate(imp = ifelse(imp == 0, "Observed", paste0("Imputation ", imp))) %>%
  filter(var %in% unique(imp_df$continuous$var))
## Plot observed vs imputed densities.
## NOTE(review): position = "jitter" is unusual for geom_density --
## confirm it is intended rather than the default position.
ggplot(obsimp_df_continuous,
       aes(x = value, col = imp)) +
  geom_density(position = "jitter") +
  facet_wrap(~varlab, scales = "free", ncol = 3) +
  xlab("") + ylab("Density") +
  scale_color_discrete(name = "") +
  theme(legend.position = "bottom")
# Plot categorical variables
## Data for plotting
obsimp_df_cat <-
bind_rows(
imp_df$cat %>%
group_by(var, varlab, value, imp) %>%
summarise(n = n()) %>%
group_by(var, varlab, imp) %>%
mutate(freq = n / sum(n)),
cat_var_df %>%
select(var, value, varlab, n, freq) %>%
mutate(imp = 0, obs = "Observed")
) %>%
mutate(imp = ifelse(imp == 0, "Observed", paste0("Imputation ", imp))) %>%
filter(var %in% unique(imp_df$cat$var))
## `summarise()` regrouping output by 'var', 'varlab', 'value' (override with `.groups` argument)
# Plot
ggplot(obsimp_df_cat,
aes(x = value, y = freq, fill = imp)) +
geom_bar(position = "dodge", stat = "identity") +
facet_wrap(~varlab, scales = "free_x") +
scale_fill_discrete(name = "") +
xlab("") +
ylab("Proportion") +
theme(legend.position = "bottom",
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
We create the race/ethnicity variable after imputation of missing values of race and ethnicity. We will also combine the diabetes and hypertension variables.
# Derive the combined race/ethnicity variable on the stacked (long) data so
# observed and imputed rows are handled identically
mi_df <- mi_df %>%
add_race_ethnicity()
# Sanity check: prevalence of the combined diabetes and hypertension flags
# in the original (non-imputed, .imp == 0) rows
prop.table(table(mi_df %>% filter(.imp == 0) %>% pull(diab)))
##
## No Yes
## 0.6625201 0.3374799
prop.table(table(mi_df %>% filter(.imp == 0) %>% pull(hyp)))
##
## No Yes
## 0.4146056 0.5853944
Now let's use the combined race and ethnicity category in our model.
# Swap the separate race and ethnicity predictors for the combined
# race_ethnicity variable, then rebuild the candidate right-hand side
vars <- vars %>%
mutate(include = case_when(
var == "race_ethnicity" ~ 1,
var == "race" ~ 0,
var == "ethnicity" ~ 0,
TRUE ~ include
)
)
candidate_model_rhs <- make_rhs(get_included_vars())
Finally, we will (i) add the new race/ethnicity variable to the “MICE” object and (ii) create a list of imputed datasets for analysis.
# Rebuild the mids object (now containing race_ethnicity) and split the
# stacked data into one completed dataset per imputation (.imp = 1..m)
mice_out <- as.mids(mi_df)
mi_list <- mi_df %>%
filter(.imp > 0) %>%
split(list(.$.imp))
The continuous variables in our model are transformed using restricted cubic splines with 3 knots.
# Transform every continuous predictor with a restricted cubic spline
# (3 knots), removing each linear term and adding its spline expansion.
# BUG FIX: the original statement had a comma (instead of `+`) after the
# ldh_t term, which passed the remaining terms (lymphocyte_t, neutrophil_t,
# plt_t, wbc_t) as an extra argument to update.formula() that was silently
# ignored -- so those four variables entered the model untransformed,
# contradicting the stated intent that all continuous variables use splines.
candidate_model_rhs <- candidate_model_rhs %>%
  update.formula( ~. + rcs(age, 3) - age +
                    rcs(calendar_time, 3) - calendar_time +
                    rcs(bmi, 3) - bmi +
                    rcs(temp, 3) - temp +
                    rcs(hr, 3) - hr +
                    rcs(resp, 3) - resp +
                    rcs(sbp, 3) - sbp +
                    rcs(spo2, 3) - spo2 +
                    rcs(crp_t, 3) - crp_t +
                    rcs(tni_t, 3) - tni_t +
                    rcs(ast_t, 3) - ast_t +
                    rcs(creatinine_t, 3) - creatinine_t +
                    rcs(ferritin_t, 3) - ferritin_t +
                    rcs(ldh_t, 3) - ldh_t +
                    rcs(lymphocyte_t, 3) - lymphocyte_t +
                    rcs(neutrophil_t, 3) - neutrophil_t +
                    rcs(plt_t, 3) - plt_t +
                    rcs(wbc_t, 3) - wbc_t)
With oem we need to create an x matrix since there is no formula interface.
# Strip the rcs(...) wrapper from spline term names (e.g. "rcs(age,_3)age"
# becomes "age"); names without an rcs wrapper pass through unchanged.
rename_rcs <- function(v) {
  has_rcs <- grepl("rcs", v)
  v[has_rcs] <- sub("rcs.*)", "", v[has_rcs])
  v
}
# Sanitize model-matrix column names for use with oem: replace spaces with
# underscores, delete "-" and "/", then strip any rcs() wrapper so columns
# can be mapped back to their source variable names.
rename_terms <- function(v) {
  cleaned <- gsub("[-/]", "", gsub(" ", "_", v))
  rename_rcs(cleaned)
}
# Build the design matrix for oem (which has no formula interface): expand
# candidate_model_rhs on `data`, sanitize the column names, and drop the
# intercept column while keeping the term-to-group "assign" attribute in sync.
make_x <- function(data) {
  mm <- model.matrix(candidate_model_rhs, data)
  term_groups <- attr(mm, "assign")
  colnames(mm) <- rename_terms(colnames(mm))
  mm <- mm[, -1]
  attr(mm, "assign") <- term_groups[-1]
  mm
}
# List of x and y for each imputed dataset
x <- mi_list %>% map(make_x)
# Outcome vector (died) extracted from each imputed dataset
y <- mi_list %>% map(function(x) x[["died"]])
To make the graphs look nice, we will import labels for the model terms.
# Display labels for model terms, joined with each variable's group
terms <- read.csv("risk-factors-terms.csv") %>%
left_join(vars[, c("var", "group")], by = "var")
# Look up display labels for model terms in the global `terms` table;
# `term_name` selects which column to match against. Unmatched entries
# return NA.
get_term_labs <- function(v, term_name = "term"){
  idx <- match(v, terms[[term_name]])
  terms$termlab[idx]
}
# Map model-matrix term names back to their source variable names via the
# global `terms` table; unmatched entries return NA.
match_terms_to_vars <- function(t){
  positions <- match(t, terms$term)
  terms$var[positions]
}
Finally, we run the grouped lasso.
# Number of folds for cross-validation
n_folds <- 10
# Threshold for variable inclusion
inclusion_threshold <- 0.9
# Matrix to store inclusion results
# (+1 column for the intercept, which is removed again after the loop)
inclusion_sim <- matrix(0, ncol = ncol(x[[1]]) + 1,
nrow = n_rep * n_imputations)
# Groups
# One group id per model-matrix column (from the "assign" attribute), so the
# group lasso selects whole variables rather than individual dummy/spline terms
groups <- attr(x[[1]], "assign")
# Convenience function to extract the grouped-lasso coefficient vector at
# the lambda.1se solution from a cv.oem fit.
coef_cv_oem <- function(object){
  lambda_path <- object$lambda[[1]]
  lse_col <- which(lambda_path == object$lambda.1se.models)
  object$oem.fit$beta$grp.lasso[, lse_col]
}
# Variable selection via Group-Lasso: stability selection by repeating
# k-fold cross-validation n_rep times on each imputed dataset and recording
# which coefficients are non-zero at lambda.1se.
# (seq_len() replaces 1:n so the loops are safe if a count is ever 0.)
cntr <- 1
for (i in seq_len(n_imputations)){
  for (j in seq_len(n_rep)){
    # Cross-validation
    cv_fit <- cv.oem(x = x[[i]], y = y[[i]],
                     penalty = "grp.lasso",
                     groups = groups,
                     family = "binomial",
                     type.measure = "deviance",
                     nfolds = n_folds
    )
    # Count nonzero coefficients in this replicate
    inclusion_sim[cntr, which(coef_cv_oem(cv_fit) != 0)] <- 1
    # Iterate
    cntr <- cntr + 1
  } # End repeated CV loop
} # End imputation loop
Let’s check the inclusion probabilities from the above repeated cross-validation steps.
# Percentage of simulations each term is included
# (uses `<-` instead of `=` for assignment, per house style)
inclusion_sim <- inclusion_sim[, -1] # Remove intercept
colnames(inclusion_sim) <- colnames(x[[1]])
inclusion_summary <- tibble(term = colnames(inclusion_sim),
                            prob = apply(inclusion_sim, 2, mean))
# Terms whose inclusion frequency clears the stability threshold
model_terms <- inclusion_summary %>%
  filter(prob >= inclusion_threshold) %>%
  pull(term)
# Percentage of simulations each variable is included
# (terms from the same variable share an inclusion probability because the
# lasso penalizes them as a group)
inclusion_summary <- inclusion_summary %>%
  mutate(var = match_terms_to_vars(term)) %>%
  mutate(varlab = get_var_labs(var)) %>%
  distinct(prob, var, varlab)
# Plot inclusion probabilities against the threshold
ggplot(inclusion_summary,
       aes(x = reorder(varlab, prob), y = prob)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = inclusion_threshold, linetype = "dotted",
             color = "red", size = 1) +
  ylab("Probability of inclusion") +
  coord_flip() +
  theme(axis.title.y = element_blank())
Variables are included in the model based on the group Lasso simulation implemented above.
# Variables whose inclusion probability falls below the stability
# threshold are dropped from the final model
vars_to_exclude <- inclusion_summary %>%
filter(prob < inclusion_threshold) %>%
pull(var)
# Drop every term of a one-sided formula that references one of the
# variables in `vars_to_exclude` (including rcs() spline terms).
#
# Fixes vs. original:
# * With an empty `vars_to_exclude` the original built the pattern "",
#   which grepl() matches against EVERY term, silently emptying the
#   formula. Return the formula unchanged instead.
# * Word boundaries are added so that excluding e.g. "diab" no longer
#   also removes "diabwc"/"diabunc".
remove_terms_from_rhs <- function(f, vars_to_exclude){
  if (length(vars_to_exclude) == 0) {
    return(f)
  }
  # Flatten the (possibly multi-line) deparsed formula into one string
  f_string <- Reduce(paste, deparse(f))
  f_string <- gsub("~", "", f_string)
  f_string <- gsub(" ", "", f_string)
  # Then convert string to a vector of individual terms
  f_vec <- unlist(strsplit(f_string, "\\+"))
  pattern_to_exclude <- paste0("\\b(",
                               paste(vars_to_exclude, collapse = "|"),
                               ")\\b")
  f_vec <- f_vec[!grepl(pattern_to_exclude, f_vec)]
  # Convert string back to formula
  f_new <- paste0("~", paste(f_vec, collapse = " + "))
  return(as.formula(f_new))
}
model_rhs <- remove_terms_from_rhs(candidate_model_rhs, vars_to_exclude)
# Attach Hmisc-style labels to columns for rms printing/plotting.
# NOTE(review): the map() body's value is the result of an inner assignment;
# presumably label<-.data.frame accepts this list of labels -- verify the
# labels actually stick to mi_df.
label(mi_df) <- map(colnames(mi_df),
function(x) label(mi_df[, x]) <- get_var_labs(x))
# rms requires a datadist object to define reference/adjust-to values
dd <- datadist(mi_df, adjto.cat = "first")
options(datadist = "dd")
As described in the paper, four models are fit that include the following predictors: (i) age only, (ii) all demographics (and calendar time), and (iii) demographics (and calendar time) and comorbidities, and (iv) all variables selected by the grouped lasso.
# Full-model formula: all lasso-selected variables predicting death
f_logit_all <- update.formula(model_rhs, died ~ .)
# The four models
lrm_names <- c("Age only",
"All demographics",
"Demographics and comorbidities",
"All variables")
## (1): fit_age: Only includes age
f_logit_age <- died ~ age
lrm_fit_age <- fit.mult.impute(f_logit_age, fitter = lrm, xtrans = mice_out, pr = FALSE,
x = TRUE, y = TRUE)
## (2): fit_d: All demographics including age
f_logit_d <- update.formula(f_logit_age, ~. + sex + rcs(calendar_time, 3) +
race_ethnicity + division)
lrm_fit_d <- fit.mult.impute(f_logit_d, fitter = lrm, xtrans = mice_out, pr = FALSE,
x = TRUE, y = TRUE)
## (3): fit_dc: Demographics and comorbidities
# Comorbidity variables that survived the lasso selection
c_vars <- vars %>%
filter(group == "Comorbidities" & include == 1 & !var %in% vars_to_exclude) %>%
pull(var)
f_logit_dc <- update.formula(f_logit_d,
as.formula(paste0("~.+", paste(c_vars, collapse = "+"))))
lrm_fit_dc <- fit.mult.impute(f_logit_dc, fitter = lrm, xtrans = mice_out, pr = FALSE,
x = TRUE, y = TRUE)
## (4): fit: The main model including demographics, comorbidities, vitals, and labs
lrm_fit_all <- fit.mult.impute(f_logit_all, fitter = lrm, xtrans = mice_out,
pr = FALSE, x = TRUE, y = TRUE)
# Note that we can also estimate the models with stats::glm
# (one glm per imputed dataset, pooled with mice::pool)
glm_fits_all <- mi_list %>%
map(function (x) glm(f_logit_all, data = x, family = "binomial"))
glm_fit_all <- glm_fits_all %>% pool()
We will print the results of the full model for the interested reader
# Print the pooled full-model fit (console output shown below)
lrm_fit_all
## Logistic Regression Model
##
## fit.mult.impute(formula = f_logit_all, fitter = lrm, xtrans = mice_out,
## pr = FALSE, x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 13666 LR chi2 4231.45 R2 0.457 C 0.890
## 0 11504 d.f. 59 g 2.402 Dxy 0.780
## 1 2162 Pr(> chi2) <0.0001 gr 11.047 gamma 0.781
## max |deriv| 2e-09 gp 0.205 tau-a 0.208
## Brier 0.087
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 13.6351 2.6127 5.22 <0.0001
## sex=Female -0.0229 0.0667 -0.34 0.7307
## division=East North Central -0.3840 0.1941 -1.98 0.0479
## division=Middle Atlantic -0.2833 0.1728 -1.64 0.1011
## division=New England -0.1864 0.1980 -0.94 0.3466
## division=Other -0.8502 0.3351 -2.54 0.0112
## division=South Atl/West South Crl -0.4907 0.2631 -1.86 0.0622
## division=West North Central -0.4450 0.2095 -2.12 0.0337
## smoke=Current -0.0405 0.1332 -0.30 0.7613
## smoke=Previous 0.1762 0.0908 1.94 0.0523
## race_ethnicity=Asian -0.1019 0.1903 -0.54 0.5924
## race_ethnicity=Hispanic -0.1858 0.1254 -1.48 0.1386
## race_ethnicity=Non-Hispanic black -0.5651 0.0875 -6.46 <0.0001
## ami=Yes 0.0366 0.0851 0.43 0.6671
## chf=Yes 0.1837 0.0769 2.39 0.0169
## pvd=Yes 0.0599 0.0821 0.73 0.4660
## cevd=Yes 0.0943 0.0885 1.07 0.2862
## dementia=Yes 0.3787 0.0834 4.54 <0.0001
## copd=Yes 0.0498 0.0700 0.71 0.4764
## rheumd=Yes 0.0571 0.1586 0.36 0.7189
## mld=Yes 0.0839 0.1330 0.63 0.5281
## hp=Yes 0.7286 0.1583 4.60 <0.0001
## rend=Yes 0.1088 0.0823 1.32 0.1862
## canc=Yes 0.0325 0.0860 0.38 0.7053
## msld=Yes 0.7995 0.2551 3.13 0.0017
## metacanc=Yes 0.8501 0.1826 4.65 <0.0001
## aids=Yes 0.6647 0.3651 1.82 0.0687
## diab=Yes 0.1244 0.0664 1.87 0.0609
## hyp=Yes 0.1104 0.0787 1.40 0.1606
## lymphocyte_t -0.2263 0.0561 -4.03 <0.0001
## plt_t -0.0025 0.0004 -6.15 <0.0001
## wbc_t 0.0465 0.0112 4.14 <0.0001
## age 0.0672 0.0044 15.39 <0.0001
## age' -0.0046 0.0056 -0.82 0.4106
## calendar_time -0.0176 0.0047 -3.70 0.0002
## calendar_time' 0.0079 0.0081 0.97 0.3334
## bmi -0.0254 0.0101 -2.53 0.0115
## bmi' 0.0460 0.0119 3.88 0.0001
## temp -0.1885 0.0530 -3.55 0.0004
## temp' 0.9287 0.0947 9.81 <0.0001
## hr 0.0020 0.0044 0.45 0.6526
## hr' 0.0109 0.0049 2.22 0.0265
## resp 0.0415 0.0259 1.60 0.1090
## resp' 0.0429 0.0250 1.72 0.0858
## sbp -0.0334 0.0036 -9.34 <0.0001
## sbp' 0.0314 0.0043 7.34 <0.0001
## spo2 -0.1232 0.0169 -7.27 <0.0001
## spo2' 0.1289 0.0279 4.61 <0.0001
## crp_t 0.0033 0.0018 1.84 0.0651
## crp_t' -0.0035 0.0020 -1.73 0.0838
## tni_t 17.5611 3.6648 4.79 <0.0001
## tni_t' -47.8269 13.7956 -3.47 0.0005
## ast_t 0.0081 0.0042 1.90 0.0573
## ast_t' -0.0155 0.0076 -2.04 0.0417
## creatinine_t 0.2842 0.1299 2.19 0.0288
## creatinine_t' -0.1662 0.2269 -0.73 0.4638
## ferritin_t 0.0002 0.0002 0.90 0.3685
## ferritin_t' -0.0003 0.0005 -0.72 0.4688
## ldh_t 0.0050 0.0013 3.82 0.0001
## ldh_t' -0.0042 0.0018 -2.38 0.0172
##
Let’s examine the extent to which our predictors are collinear: for categorical and continuous variables, we use an ANOVA model; for categorical and categorical variables, we use Cramer’s V; and for continuous and continuous variables, we use Spearman correlation. (Note that this takes a long time to run and could probably be made more efficient.)
## from https://stackoverflow.com/questions/52554336/plot-the-equivalent-of-correlation-matrix-for-factors-categorical-data-and-mi
# Pairwise association measure for a data frame with mixed column types:
#   factor/character vs factor/character -> (bias-corrected) Cramer's V
#   numeric vs numeric                   -> correlation (default Spearman)
#   numeric vs factor/character          -> sqrt(R^2) from a linear model
# Returns one row per ordered variable pair with columns x, y, assoc, type,
# complete_obs_pairs and complete_obs_ratio.
mixed_assoc <- function(df, cor_method = "spearman", adjust_cramersv_bias = TRUE){
# All ordered variable pairs, including self-pairs (the diagonal)
df_comb <- expand.grid(names(df), names(df), stringsAsFactors = F) %>%
set_names("X1", "X2")
is_nominal <- function(x) inherits(x, c("factor", "character"))
# https://community.rstudio.com/t/why-is-purr-is-numeric-deprecated/3559
# https://github.com/r-lib/rlang/issues/781
is_numeric <- function(x) { is.integer(x) || is_double(x)}
# Association for one ordered pair of columns
f <- function(x_name, y_name) {
x <- pull(df, x_name)
y <- pull(df, y_name)
result <- if(is_nominal(x) && is_nominal(y)){
# use bias corrected cramersV as described in https://rdrr.io/cran/rcompanion/man/cramerV.html
cv <- cramerV(as.character(x), as.character(y),
bias.correct = adjust_cramersv_bias)
data.frame(x_name, y_name, assoc = cv, type = "cramersV")
} else if(is_numeric(x) && is_numeric(y)){
correlation <- cor(x, y, method = cor_method, use = "complete.obs")
data.frame(x_name, y_name, assoc = correlation, type = "correlation")
} else if(is_numeric(x) && is_nominal(y)){
# from https://stats.stackexchange.com/questions/119835/correlation-between-a-nominal-iv-and-a-continuous-dv-variable/124618#124618
r_squared <- summary(lm(x ~ y))$r.squared
data.frame(x_name, y_name, assoc = sqrt(r_squared), type = "anova")
} else if(is_nominal(x) && is_numeric(y)){
r_squared <- summary(lm(y ~ x))$r.squared
data.frame(x_name, y_name, assoc = sqrt(r_squared), type = "anova")
} else {
# NOTE(review): if this branch is ever reached, `result` becomes the
# return value of warning() rather than a data.frame, and the mutate()
# below will error -- consider stop() or an NA row instead
warning(paste("unmatched column type combination: ", class(x), class(y)))
}
# finally add complete obs number and ratio to table
result %>%
mutate(
complete_obs_pairs = sum(!is.na(x) & !is.na(y)),
complete_obs_ratio = complete_obs_pairs/length(x)) %>%
rename(x = x_name, y = y_name)
}
# apply function to each variable combination
map2_df(df_comb$X1, df_comb$X2, f)
}
# Create correlation matrix of associations
# (computed on the first imputed dataset only)
corr_mat <- mi_df %>%
filter(.imp == 1) %>%
select(any_of(get_included_vars())) %>%
mixed_assoc() %>%
select(x, y, assoc) %>%
pivot_wider(names_from = y, values_from = assoc) %>%
column_to_rownames("x") %>%
as.matrix
# Make tile plot
# Absolute associations; row()/col() index into the matrix to build a long
# data frame for ggplot
m <- abs(corr_mat)
heatmap_df <- tibble(row = rownames(m)[row(m)],
col = colnames(m)[col(m)], corr = c(m)) %>%
mutate(row = get_var_labs(row),
col = get_var_labs(col))
heatmap_df %>%
ggplot(aes(x = row, y = col, fill = corr)) +
geom_tile() +
scale_fill_continuous("Correlation") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
axis.title = element_blank())
Variable importance is assessed with a Wald test using the rms::anova().
# Compute variable importance with Wald chi-square
# (ranked by chi-square minus its degrees of freedom)
lrm_anova_all <- anova(lrm_fit_all)
# Plot the result, dropping the aggregate rows produced by anova.rms
lrm_anova_all %>%
as_tibble() %>%
mutate(var = gsub(" ", "", rownames(lrm_anova_all)),
varlab = get_var_labs(var),
value = as.double(`Chi-Square` - `d.f.`)) %>%
filter(!var %in% c("TOTAL", "Nonlinear", "TOTALNONLINEAR")) %>%
ggplot(aes(x = value, y = reorder(varlab, value))) +
geom_point() +
theme(axis.title.y = element_blank()) +
xlab(expression(chi^2-df))
The model is summarized with odds ratios using rms::summary(). We will start by assessing the full model. Next, since labs and vitals may themselves be “caused” by comorbidities, we will see how the odds ratios for the comorbidities change after dropping labs and vitals from the model.
# Effect summary (contrasts used for odds ratios) of the full model
lrm_summary_all <- summary(lrm_fit_all)
# Odds ratios
# Format the Low/High contrast values from rms::summary(): two decimals for
# values under 10, one decimal for temperature, whole numbers otherwise.
# (case_when picks the first matching condition, so small values always get
# two decimals, even for temperature.)
format_or_range <- function(x, term){
  two_dp  <- formatC(x, format = "f", digits = 2)
  one_dp  <- formatC(x, format = "f", digits = 1)
  zero_dp <- formatC(x, format = "f", digits = 0)
  case_when(
    x < 10 ~ two_dp,
    term == "temp" ~ one_dp,
    TRUE ~ zero_dp
  )
}
# Tidy an rms::summary() matrix into a tibble of odds ratios with 95% CIs
# and display labels, keeping only the odds-ratio rows (Type == 1) and
# sorting by descending odds ratio. `model_name` tags the rows for plotting.
make_tidy_or <- function(object, model_name = NULL){
  if (is.null(model_name)) model_name <- "Model"
  out <- object %>%
    as.data.frame() %>%
    as_tibble()
  out$term <- rownames(object)
  out$High <- format_or_range(out$High, out$term)
  out$Low <- format_or_range(out$Low, out$term)
  out$termlab <- get_term_labs(out$term, "term2")
  # Continuous effects are High:Low contrasts; append the range to the label
  out$termlab <- ifelse(!is.na(out[["Diff."]]),
                        paste0(out$termlab, " - ", out$High, ":", out$Low),
                        out$termlab)
  out$or <- exp(out$Effect)
  out$or_lower <- as.double(exp(out[["Lower 0.95"]]))
  out$or_upper <- exp(out[["Upper 0.95"]])
  out %>%
    filter(Type == 1) %>%
    select(term, termlab, or, or_lower, or_upper) %>%
    arrange(-or) %>%
    mutate(model = model_name)
}
lrm_or_all <- make_tidy_or(lrm_summary_all, "All variables")
# Odds ratio plot
ggplot(lrm_or_all,
aes(x = or, y = reorder(termlab, or))) +
geom_point() +
geom_errorbarh(aes(xmax = or_upper, xmin = or_lower,
height = .2)) +
geom_vline(xintercept = 1, linetype = "dashed", col = "grey") +
theme(axis.title.y = element_blank()) +
xlab("Odds ratio")
# Odds ratios from the demographics + comorbidities model (no labs/vitals)
lrm_summary_dc <- summary(lrm_fit_dc)
lrm_or_dc <- make_tidy_or(lrm_summary_dc, "Demographics + comorbidities")
# Odds ratio comparison plot
# Compare demographic/comorbidity odds ratios with vs without labs and
# vitals in the model
lrm_or_comp <- bind_rows(lrm_or_all, lrm_or_dc) %>%
filter(term %in%
terms[terms$group %in% c("Demographics", "Comorbidities"), ]$term2) %>%
mutate(termlab = factor(termlab),
termlab = reorder(termlab, or, function (x) -mean(x)))
ggplot(lrm_or_comp,
aes(x = termlab, y = or, col = model)) +
geom_point(position = position_dodge(width = 1)) +
geom_errorbar(aes(ymax = or_upper, ymin = or_lower,
width = .2), position = position_dodge(width = 1)) +
facet_wrap(~termlab, strip.position = "left", ncol = 1, scales = "free_y") +
geom_hline(yintercept = 1, linetype = "dashed") +
theme(axis.title.y = element_blank()) +
scale_color_discrete(name = "Model") +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
strip.text.y.left = element_text(hjust = 0, vjust = 1,
angle = 0, size = 8),
legend.position = "bottom") +
ylab("Odds ratio") +
coord_flip()
One limitation of the odds ratio plots is that it is hard to examine the potentially non-linear relationships between mortality and the continuous variables. The rms::Predict() function is especially helpful in this regard. We use it to vary all of the predictors and plot predicted log odds across the different values of each predictor.
Each prediction is made with all other variables at their “Adjust to” value as specified with datadist() above.
# Tabulate the datadist reference ("Adjust to") values used for predictions
t(dd$limits) %>%
as_tibble() %>%
mutate(Variable = get_var_labs(colnames(dd$limits))) %>%
relocate(Variable, .before = "Low:effect") %>%
filter(!is.na(Variable)) %>%
arrange(Variable) %>%
kable() %>%
kable_styling()
| Variable | Low:effect | Adjust to | High:effect | Low:prediction | High:prediction | Low | High |
|---|---|---|---|---|---|---|---|
| Acute myocardial infarction | NA | No | NA | No | Yes | No | Yes |
| Age | 49 | 62 | 75 | 18 | 89 | 18 | 89 |
| AIDS/HIV | NA | No | NA | No | Yes | No | Yes |
| Aspartate aminotransferase (AST) | 25 | 36 | 57 | 6 | 157 | 4 | 157 |
| Body Mass Index (BMI) | 25.54000 | 29.73000 | 35.16000 | 13.32705 | 76.58000 | 11.84000 | 149.50000 |
| C-reactive protein (CRP) | 29.0 | 72.0 | 133.0 | 0.2 | 458.0 | 0.0 | 458.0 |
| Calendar time | 37 | 46 | 63 | 0 | 90 | 0 | 90 |
| Cancer | NA | No | NA | No | Yes | No | Yes |
| Cerebrovascular disease | NA | No | NA | No | Yes | No | Yes |
| Chronic pulmonary disease | NA | No | NA | No | Yes | No | Yes |
| Congestive heart failure | NA | No | NA | No | Yes | No | Yes |
| Creatinine | 0.80 | 1.00 | 1.40 | 0.23 | 3.24 | 0.19 | 3.24 |
| Dementia | NA | No | NA | No | Yes | No | Yes |
| Diabetes | NA | No | NA | No | Yes | No | Yes |
| Ethnicity | NA | Hispanic | NA | Hispanic | Not Hispanic | Hispanic | Not Hispanic |
| Ferritin | 205.000 | 464.900 | 991.000 | 4.000 | 3646.875 | 3.000 | 3646.875 |
| Geographic division | NA | Pacific | NA | Pacific | West North Central | Pacific | West North Central |
| Heart rate | 77.5 | 87.5 | 98.0 | 38.0 | 164.0 | 20.0 | 203.0 |
| Hemiplegia or paraplegia | NA | No | NA | No | Yes | No | Yes |
| Hypertension | NA | No | NA | No | Yes | No | Yes |
| Lactate dehydrogenase (LDH) | 230.00 | 308.00 | 422.00 | 65.00 | 1049.50 | 24.99 | 1049.50 |
| Lymphocyte count | 0.7 | 1.0 | 1.4 | 0.0 | 3.5 | 0.0 | 3.5 |
| Metastatic cancer | NA | No | NA | No | Yes | No | Yes |
| Mild liver disease | NA | No | NA | No | Yes | No | Yes |
| Moderate/severe liver disease | NA | No | NA | No | Yes | No | Yes |
| Neutrophil count | 3.36 | 4.90 | 7.10 | 0.00 | 18.32 | 0.00 | 18.32 |
| Oxygen saturation | 94.0 | 96.0 | 98.0 | 58.5 | 100.0 | 26.0 | 100.0 |
| Peptic ulcer disease | NA | No | NA | No | Yes | No | Yes |
| Peripheral vascular disease | NA | No | NA | No | Yes | No | Yes |
| Platelet count (PLT) | 158 | 203 | 261 | 7 | 569 | 1 | 569 |
| Race | NA | African American | NA | African American | Caucasian | African American | Caucasian |
| Race/Ethnicity | NA | Non-Hispanic white | NA | Non-Hispanic white | Non-Hispanic black | Non-Hispanic white | Non-Hispanic black |
| Renal disease | NA | No | NA | No | Yes | No | Yes |
| Respiration rate | 18.00000 | 20.00000 | 22.00000 | 5.00000 | 56.25904 | 2.00000 | 99.00000 |
| Rheumatoid disease | NA | No | NA | No | Yes | No | Yes |
| Sex | NA | Male | NA | Male | Female | Male | Female |
| Smoking | NA | Never | NA | Never | Previous | Never | Previous |
| Systolic blood pressure | 115 | 126 | 139 | 45 | 225 | 30 | 266 |
| Temperature | 36.70000 | 37.00000 | 37.40000 | 16.00000 | 41.22064 | 16.00000 | 42.20000 |
| Troponin I | 0.010 | 0.010 | 0.045 | 0.000 | 0.170 | 0.000 | 0.170 |
| White blood cell count (WBC) | 4.91 | 6.70 | 9.10 | 0.30 | 21.70 | 0.20 | 21.70 |
Let’s make the predictions.
# Partial-effect (log-odds) predictions for every predictor, relative to
# the datadist reference values (ref.zero = TRUE)
lrm_log_odds <- Predict(lrm_fit_all, ref.zero = TRUE)
# Get plotting data
p_log_odds <- ggplot(lrm_log_odds, sepdiscrete = "list")
# Continuous plot
# Symmetric axis limits so effect sizes are visually comparable across panels
log_odds_limit <- max(ceiling(abs(p_log_odds$continuous$data$yhat)))
log_odds_breaks <- seq(-log_odds_limit, log_odds_limit, 1)
p_log_odds_continuous <- p_log_odds$continuous$data %>%
as_tibble() %>%
mutate(varlab = get_var_labs(.predictor.)) %>%
ggplot(aes(x = .xx., y = yhat)) +
facet_wrap(~varlab, scales = "free_x", ncol = 4) +
geom_line() +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey") +
ylab("Log odds") +
scale_y_continuous(breaks = log_odds_breaks,
limits = c(-log_odds_limit, log_odds_limit)) +
theme(axis.title.x = element_blank(),
strip.text = element_text(size = 7))
# Discrete plot
log_odds_limit <- max(ceiling(abs(p_log_odds$discrete$data$yhat)))
log_odds_breaks <- seq(-log_odds_limit, log_odds_limit, 1)
p_log_odds_discrete <- p_log_odds$discrete$data %>%
as_tibble() %>%
mutate(varlab = get_var_labs(.predictor.)) %>%
ggplot(aes(x = yhat, y = .xx.)) +
facet_wrap(~varlab, scales = "free_y", ncol = 4) +
geom_point(size = .9) +
geom_errorbarh(aes(xmin = lower , xmax = upper, height = 0)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey") +
xlab("Log odds") +
scale_x_continuous(breaks = log_odds_breaks,
limits = c(-log_odds_limit, log_odds_limit)) +
theme(axis.title.y = element_blank(),
strip.text = element_text(size = 7))
# Combine plots
grid.arrange(p_log_odds_discrete, p_log_odds_continuous,
heights = c(5, 5))
## Warning: Removed 6 rows containing missing values (geom_errorbarh).
The predicted log odds plot is a nice way to summarize non-linear effects, but it’s not the ultimate quantity of interest. We really care about predicted probabilities. But to compute a predicted probability we need to set all covariates in the model to certain values. Rather than choosing specific values (i.e., creating a representative patient), we will make predictions over a representative sample and average predictions across the sample.
# Make newdata
## Start with a random sample of patients
n_samples <- 1000
train_sample <- mi_list[[1]] %>%
filter(.imp > 0) %>%
## NOTE(review): mi_list elements were already restricted to .imp > 0 when
## created above, so this filter is presumably redundant -- confirm
sample_n(size = n_samples)
We will write a general function to predict mortality as a function of (i) our fitted model, (ii) the representative sample, and (iii) the variables that we are varying.
# Cross the representative sample with every combination of counterfactual
# values in `vars_to_vary` (a named list), first dropping the columns that
# will be replaced by the grid.
expand_newdata <- function(data, vars_to_vary){
  value_grid <- expand.grid(vars_to_vary)
  data %>%
    select(-all_of(names(vars_to_vary))) %>%
    crossing(value_grid)
}
# Predicted mortality probability, averaged over the representative sample
# for each combination of the varied covariate values in `vars_to_vary`.
predict_mortality <- function(fit, newdata, vars_to_vary){
  expanded <- expand_newdata(newdata, vars_to_vary)
  probs <- predict(fit, newdata = data.frame(expanded),
                   type = "fitted", se.fit = FALSE)
  # Average predictions within each combination of the varied values
  expanded %>%
    mutate(prob = probs) %>%
    group_by(across(all_of(names(vars_to_vary)))) %>%
    summarise(prob = mean(prob))
}
# Bootstrap the averaged mortality predictions: resample `newdata` with
# replacement B times and recompute predict_mortality() on each resample.
# Returns one stacked tibble with a bootstrap-replicate id column `b`.
# (seq_len() replaces 1:B and 1:n_obs so the loops are safe for B = 0 or an
# empty newdata.)
predict_mortality_boot <- function(fit, newdata, vars_to_vary, B = 100){
  n_obs <- nrow(newdata)
  boot_probs <- vector(mode = "list", length = B)
  for (b in seq_len(B)){
    indices <- sample(x = seq_len(n_obs), size = n_obs, replace = TRUE)
    newdata_boot <- newdata[indices, ]
    boot_probs[[b]] <- predict_mortality(fit, newdata_boot, vars_to_vary)
    boot_probs[[b]][, "b"] <- b
  }
  return(bind_rows(boot_probs))
}
# Average predicted mortality across the sample at each year of age
ages_to_vary <- seq(min(train_data$age), max(train_data$age), 1)
probs_age <- predict_mortality(lrm_fit_all, newdata = train_sample,
vars_to_vary = list(age = ages_to_vary)) %>%
rename(Age = age, `Mortality probability` = prob)
## `summarise()` ungrouping output (override with `.groups` argument)
# Table using DT package
datatable(probs_age,
rownames = FALSE,
options = list(pageLength = 20)) %>%
formatRound("Mortality probability", 4)
# Age-mortality curve
ggplot(probs_age, aes(x = `Age`, y = `Mortality probability`)) +
geom_line()
# Predicted mortality by calendar time (days since the earliest index date)
min_index_date <- min(train_data$index_date)
calendar_times_to_vary <- c(as.Date("2020-03-01"),
as.Date("2020-04-01"),
as.Date("2020-05-01")) - min_index_date
calendar_times_to_vary <- as.numeric(calendar_times_to_vary)
probs_calendar_time <- predict_mortality(
lrm_fit_all, newdata = train_sample,
vars_to_vary = list(calendar_time = calendar_times_to_vary)
) %>%
mutate(Date = min_index_date + calendar_time) %>%
rename(`Calendar time` = calendar_time, `Mortality probability` = prob) %>%
relocate(Date)
## `summarise()` ungrouping output (override with `.groups` argument)
kable(probs_calendar_time) %>% kable_styling()
| Date | Calendar time | Mortality probability |
|---|---|---|
| 2020-03-01 | 9 | 0.2276039 |
| 2020-04-01 | 40 | 0.1724317 |
| 2020-05-01 | 70 | 0.1395255 |
We will start by predicting point estimates only.
# Non-missing sex categories to vary
sex_to_vary <- unique(train_data$sex) %>%
.[!is.na(.)]
# Predict point estimates
# (one averaged prediction per age x sex x calendar-time combination)
probs_age_sex_calendar <- predict_mortality(
lrm_fit_all, newdata = train_sample,
vars_to_vary = list(age = ages_to_vary,
sex = sex_to_vary,
calendar_time = calendar_times_to_vary))
## `summarise()` regrouping output by 'age', 'sex' (override with `.groups` argument)
# Summarize in table
probs_age_sex_calendar %>%
arrange(calendar_time, sex, age) %>%
mutate(Date = min_index_date + calendar_time) %>%
select(-calendar_time) %>%
relocate(Date, sex) %>%
rename(`Sex` = sex, `Age` = age,
`Mortality probability` = prob) %>%
## Using DT::datatable
datatable(rownames = FALSE,
filter = "top",
options = list(pageLength = 20)) %>%
formatRound("Mortality probability", 4)
We will also compute 95% confidence intervals by bootstrapping.
# Bootstrap the age x sex x calendar-time predictions (n_boot_probs
# resamples of the representative sample) to obtain 95% CIs
bootprobs_age_sex_calendar <- predict_mortality_boot(
lrm_fit_all, newdata = train_sample,
vars_to_vary = list(age = ages_to_vary,
sex = sex_to_vary,
calendar_time = calendar_times_to_vary),
B = n_boot_probs)
Let’s plot the results.
# Summarize the bootstrap distribution into a mean and 95% percentile band,
# then plot age curves by sex, faceted by calendar date
bootprobs_age_sex_calendar %>%
# Summarize bootstrap results
group_by(age, sex, calendar_time) %>%
summarise(
prob_mean = mean(prob),
prob_lower = quantile(prob, .025),
prob_upper = quantile(prob, .975)
) %>%
mutate(date = min_index_date + calendar_time) %>%
# Plot
ggplot(aes(x = age, y = prob_mean, color = sex, fill = sex)) +
geom_line() +
facet_wrap(~date) +
geom_line(aes(x = age, y = prob_lower, color = sex), linetype = "dotted") +
geom_line(aes(x = age, y = prob_upper, color = sex), linetype = "dotted") +
xlab("Age") +
ylab("Mortality probability") +
scale_color_discrete("") +
scale_fill_discrete("") +
theme(legend.position = "bottom")
## `summarise()` regrouping output (override with `.groups` argument)
We will start by using bootstrapping to estimate model performance and check for whether our in-sample fits are too optimistic. Specifically, we will use the following algorithm implemented in the rms package:
A shrinkage factor can also be estimated within each bootstrap sample to gauge the extent of overfitting. This is done by fitting \(g(Y) = \gamma_0 + \gamma_1 X\hat{\beta}\) where \(X\) and \(Y\) are the covariates and outcome, respectively, in the test sample (i.e., in step 2c) and \(\hat{\beta}\) is estimated in the training sample (i.e., step 2b). If there is no overfitting, then \(\gamma_0 = 0\) and \(\gamma_1 = 1\); conversely, if there is overfitting, then \(\gamma_1 < 1\) and \(\gamma_0 \neq 0\) to compensate.
# Bootstrap validation (optimism estimation) for each of the four models
lrm_val_age <- validate(lrm_fit_age, B = n_boot_val)
lrm_val_d <- validate(lrm_fit_d, B = n_boot_val)
lrm_val_dc <- validate(lrm_fit_dc, B = n_boot_val)
lrm_val_all <- validate(lrm_fit_all, B = n_boot_val)
# Append a C-index row to an rms::validate() result matrix.
# The C-index is derived from Somers' Dxy via C = (Dxy + 1) / 2 for the
# original-sample, bootstrap-training and bootstrap-test columns; optimism
# and the corrected index follow the same definitions validate() uses.
# (Removed the unused local `n_rows` from the original.)
bind_cindex <- function(object){
  c_index <- (object["Dxy", 1:3] + 1)/2
  c_index[4] <- c_index[2] - c_index[3]  # optimism: training - test
  c_index[5] <- c_index[1] - c_index[4]  # optimism-corrected original
  c_index[6] <- object[1, 6]             # number of bootstrap replicates
  return(rbind(object, c_index))
}
# Render a validate() matrix, augmented with the derived C-index row, as a
# styled kable table with descriptive column headers.
make_validation_tbl <- function(object){
  col_labels <- c("(1) Original", "(2) Bootstrap training",
                  "(3) Bootstrap test", "Optimism: (2) - (3)",
                  "Original (corrected): (1) - (4)", "N")
  object %>%
    bind_cindex() %>%
    set_colnames(col_labels) %>%
    kable() %>%
    kable_styling()
}
# Validation table for the age-only model (rendered output below)
make_validation_tbl(lrm_val_age)
|
|
|
Optimism: (2) - (3) | Original (corrected): (1) - (4) | N | |
|---|---|---|---|---|---|---|
| Dxy | 0.5492928 | 0.5506700 | 0.5492928 | 0.0013772 | 0.5479156 | 40 |
| R2 | 0.2121268 | 0.2135547 | 0.2121268 | 0.0014279 | 0.2106989 | 40 |
| Intercept | 0.0000000 | 0.0000000 | -0.0065005 | 0.0065005 | -0.0065005 | 40 |
| Slope | 1.0000000 | 1.0000000 | 0.9947905 | 0.0052095 | 0.9947905 | 40 |
| Emax | 0.0000000 | 0.0000000 | 0.0023089 | 0.0023089 | 0.0023089 | 40 |
| D | 0.1318066 | 0.1328923 | 0.1318066 | 0.0010856 | 0.1307210 | 40 |
| U | -0.0001463 | -0.0001463 | -0.0000331 | -0.0001133 | -0.0000331 | 40 |
| Q | 0.1319530 | 0.1330386 | 0.1318397 | 0.0011989 | 0.1307541 | 40 |
| B | 0.1159272 | 0.1159854 | 0.1159503 | 0.0000350 | 0.1158922 | 40 |
| g | 1.3965451 | 1.4025119 | 1.3965451 | 0.0059668 | 1.3905783 | 40 |
| gp | 0.1457790 | 0.1464509 | 0.1457790 | 0.0006719 | 0.1451070 | 40 |
| c_index | 0.7746464 | 0.7753350 | 0.7746464 | 0.0006886 | 0.7739578 | 40 |
# Validation table for the full model (rendered output below)
make_validation_tbl(lrm_val_all)
|
|
|
Optimism: (2) - (3) | Original (corrected): (1) - (4) | N | |
|---|---|---|---|---|---|---|
| Dxy | 0.7819128 | 0.7840229 | 0.7778674 | 0.0061555 | 0.7757573 | 40 |
| R2 | 0.4571773 | 0.4614948 | 0.4528691 | 0.0086258 | 0.4485516 | 40 |
| Intercept | 0.0000000 | 0.0000000 | -0.0201162 | 0.0201162 | -0.0201162 | 40 |
| Slope | 1.0000000 | 1.0000000 | 0.9766515 | 0.0233485 | 0.9766515 | 40 |
| Emax | 0.0000000 | 0.0000000 | 0.0086833 | 0.0086833 | 0.0086833 | 40 |
| D | 0.3095606 | 0.3125432 | 0.3061459 | 0.0063972 | 0.3031633 | 40 |
| U | -0.0001463 | -0.0001463 | 0.0000799 | -0.0002263 | 0.0000799 | 40 |
| Q | 0.3097069 | 0.3126895 | 0.3060660 | 0.0066235 | 0.3030834 | 40 |
| B | 0.0869979 | 0.0863768 | 0.0877119 | -0.0013351 | 0.0883330 | 40 |
| g | 2.4021654 | 2.4334329 | 2.3763771 | 0.0570558 | 2.3451096 | 40 |
| gp | 0.2053625 | 0.2057372 | 0.2044055 | 0.0013317 | 0.2040308 | 40 |
| c_index | 0.8909564 | 0.8920115 | 0.8889337 | 0.0030778 | 0.8878786 | 40 |
# Tabulate apparent (original-sample) performance for a named list of
# validate() objects: R-squared, Brier score, and the C-index derived from
# Somers' Dxy.
summarize_performance <- function(objects){
  keep <- c("Dxy", "R2", "B")
  objects %>%
    map_df(function(x) x[keep, "index.orig"], .id = "Model") %>%
    mutate(`C-index` = (Dxy + 1)/2) %>%
    rename(`Brier score` = B, `R-Squared` = R2) %>%
    select(-Dxy) %>%
    kable(digits = 3) %>%
    kable_styling()
}
# Apparent performance summary across the four models (rendered below)
list(lrm_val_age, lrm_val_d, lrm_val_dc, lrm_val_all) %>%
set_names(lrm_names) %>%
summarize_performance()
| Model | R-Squared | Brier score | C-index |
|---|---|---|---|
| Age only | 0.212 | 0.116 | 0.775 |
| All demographics | 0.229 | 0.114 | 0.785 |
| Demographics and comorbidities | 0.256 | 0.112 | 0.803 |
| All variables | 0.457 | 0.087 | 0.891 |
# Bootstrap calibration for each of the four models
lrm_cal_age <- calibrate(lrm_fit_age, B = n_boot_val)
lrm_cal_d <- calibrate(lrm_fit_d, B = n_boot_val)
lrm_cal_dc <- calibrate(lrm_fit_dc, B = n_boot_val)
lrm_cal_all <- calibrate(lrm_fit_all, B = n_boot_val)
# Named list in the order used for plot facets
lrm_cal_list <- list(lrm_cal_age, lrm_cal_d, lrm_cal_dc, lrm_cal_all)
names(lrm_cal_list) <- lrm_names
# Plot apparent and bias-corrected calibration curves for a named list of
# rms::calibrate() objects, one facet per model. Relies on the global
# `lrm_names` for facet ordering.
plot_calibration <- function(object){
# Make tibble
# (x[, ] subsets the full object -- presumably to strip the calibrate
# class so as_tibble() treats it as a plain matrix; verify)
cal_df <- map2(object, names(object), function(x, y){
x[, ] %>%
as_tibble() %>%
mutate(model = y)
}) %>%
bind_rows() %>%
mutate(model = factor(model, levels = lrm_names))
# Plot
breaks <- seq(0, 1, .2)
ggplot() +
geom_line(data = cal_df, mapping = aes(x = predy, y = calibrated.orig,
color = "Apparent")) +
geom_line(data = cal_df, mapping = aes(x = predy, y = calibrated.corrected,
color = "Bias-corrected")) +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "grey") +
facet_wrap(~model) +
scale_x_continuous(breaks = breaks, limits = c(0, 1)) +
scale_y_continuous(breaks = breaks, limits = c(0, 1)) +
xlab("Predicted probability") +
ylab("Actual probability") +
scale_colour_manual(name = "",
values = c("Apparent" = "black",
"Bias-corrected" = "red")) +
theme(legend.position = "bottom")
}
# Draw apparent vs bias-corrected calibration for all four models
plot_calibration(lrm_cal_list)